import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import udf
from pyspark.sql.functions import col, split
from pyspark.sql.functions import array_contains
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import datetime
import geopandas as gpd
import descartes
import plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import json
import nltk
from nltk.tokenize import word_tokenize
import collections
from collections import Counter
import re
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
%matplotlib inline
# Start (or reuse) a Spark session and keep a handle on its SparkContext.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Load the previously saved restaurant business and review tables.
restBizDF = spark.read.json(path='../SavedFiles/restBiz.json')
restReviewDF = spark.read.json(path='../SavedFiles/restReview.json')

# Preview the first five rows of each table.
restBizDF.show(n=5)
restReviewDF.show(n=5)
# Summary statistics for the review star ratings.
restReviewDF.select('stars').describe().show()

# Approximate median star rating over all reviews.
restReviewDF.agg(
    f.expr('percentile_approx(stars, 0.5)').alias('med_val')
).show()

# Number of reviews per star rating, as a pandas frame ordered by count.
star_counts = (
    restReviewDF.groupBy('stars')
    .count()
    .toPandas()
    .sort_values('count')
)
star_counts
# Bar plot: number of reviews for each star rating.
sns.set()
plt.figure(figsize=(8, 5))
ax = plt.bar(
    x=star_counts['stars'],
    height=star_counts['count'],
    color='red',
)
plt.title('Bar Plot of Star Ratings')
plt.xlabel('Star Rating')
plt.ylabel('Frequency')
plt.savefig('../SavedFiles/ratingsBar')
# Pie chart: share of reviews for each star rating.
sns.set()
plt.figure(figsize=(8, 5))
ax = plt.pie(
    x=star_counts['count'],
    labels=star_counts['stars'],
    autopct='%1.1f%%',
)
plt.title('Pie Chart of Star Ratings')
plt.savefig('../SavedFiles/ratingsPie')
From the above bar chart and pie chart, we see that most of the reviews were 4- and 5-star reviews. There may be more highly rated reviews simply because the restaurants are good. People may also want to reserve 1- or 2-star ratings for restaurants that are particularly bad. Another reason the ratings may tend to be more favorable is that people are less likely to go to bad restaurants, so there would in turn be fewer reviews of them.
# Attach each review's state/province by joining on the business id.
rel_bizDF = restBizDF.select('business_id', 'state')
revs_with_state = restReviewDF.join(rel_bizDF, on='business_id', how='left_outer')

# Average rating and number of ratings per state, pulled into pandas.
starsByState = (
    revs_with_state.groupBy('state')
    .agg(f.avg('stars'), f.count('stars'))
    .toPandas()
)
starsByState.sort_values('avg(stars)')
starsByState.shape[0]

# Keep only states/provinces with more than 10,000 reviews.
starsByState = starsByState.loc[starsByState['count(stars)'] > 10000]
# Read the state/province shapefile.
fp = '../DownloadedFiles/us_can_shapefiles/ne_50m_admin_1_states_provinces_shp.shp'
map_df = gpd.read_file(fp)
map_df.head()
map_df.sr_adm0_a3.unique()

# Restrict the shapes to the United States and Canada.
map_df = map_df[map_df['sr_adm0_a3'].isin(['USA', 'CAN'])]
map_df.head()
from descartes.patch import PolygonPatch
map_df.plot()

# Join the per-state rating summary onto the shapes by postal code.
# Missing values are filled with 1, which the plotting code below treats
# as "no data" when deciding which regions to label.
merged_plot = (
    map_df.set_index('postal')
    .join(starsByState.set_index('state'))
    .fillna(1)
)
merged_plot.head()
merged_plot.geometry['MA']
# Choropleth of the average review rating per state/province, coloured on
# the full 1-5 rating scale.
variable = 'avg(stars)'
fig, ax = plt.subplots(1, figsize=(13, 11))
merged_plot.plot(column=variable, vmin=1, vmax=5, cmap='Blues', legend=True,
                 linewidth=0.8, ax=ax, edgecolor='0.8')

# One (x, y) label position per region, taken from a representative point
# of its geometry.
merged_plot['coords'] = [
    geom.representative_point().coords[0] for geom in merged_plot['geometry']
]

for idx, row in merged_plot.iterrows():
    # Only label regions whose average is above 1; regions with no rating
    # data carry the fill value 1 and stay unlabelled.
    if row[variable] > 1:
        # The label is passed positionally: the `s=` keyword was renamed to
        # `text` in Matplotlib 3.3, so the positional form works on both
        # older and newer versions.
        ax.annotate(row['iso_3166_2'][-2:], xy=row['coords'],
                    horizontalalignment='center')

ax.axis('off')
plt.savefig('../SavedFiles/revsByStateMap1to5')
Of the states/provinces with more than 10,000 restaurant reviews in the dataset, we see that the average rating does not appear to differ greatly from state to state. The lowest possible rating is 1 and the highest possible is 5, and on the above plot it is difficult to distinguish between the shades of blue. However, Nevada and Quebec seem to have higher ratings, while areas like Ontario and Illinois appear to have lower ratings, which is confirmed by the dataframe in the cell above. The lowest average rating is 3.54 (Ontario) and the highest is 3.82 (Quebec). The map below, which instead uses a color bar ranging from 3 to 4, allows us to better visualize the slight differences in rating between the different states and provinces.
# Choropleth of the average review rating per state/province, with the
# colour bar narrowed to 3-4 so small differences between regions show up.
variable = 'avg(stars)'
fig, ax = plt.subplots(1, figsize=(13, 11))
merged_plot.plot(column=variable, vmin=3, vmax=4, cmap='Blues', legend=True,
                 linewidth=0.8, ax=ax, edgecolor='0.8')

# One (x, y) label position per region, taken from a representative point
# of its geometry.
merged_plot['coords'] = [
    geom.representative_point().coords[0] for geom in merged_plot['geometry']
]

for idx, row in merged_plot.iterrows():
    # Only label regions whose average is above 1; regions with no rating
    # data carry the fill value 1 and stay unlabelled.
    if row[variable] > 1:
        # The label is passed positionally: the `s=` keyword was renamed to
        # `text` in Matplotlib 3.3, so the positional form works on both
        # older and newer versions.
        ax.annotate(row['iso_3166_2'][-2:], xy=row['coords'],
                    horizontalalignment='center')

ax.axis('off')
plt.savefig('../SavedFiles/revsByStateMap3to4')
# Businesses whose category list includes 'Fast Food'.
fast_food = restBizDF.where(f.array_contains(restBizDF.categoriesList, 'Fast Food'))

# Collect their business ids to the driver as a plain Python list.
fast_food_id = [
    row['business_id'] for row in fast_food.select('business_id').collect()
]
fast_food.select('business_id', 'name', 'city', 'state', 'stars').show(10)

# Split the reviews into fast-food and non-fast-food subsets.
is_fast_food = restReviewDF.business_id.isin(fast_food_id)
fast_food_rev = restReviewDF.filter(is_fast_food)
not_fast_rev = restReviewDF.filter(~is_fast_food)

# Summary statistics of the star ratings in each subset.
fast_food_rev[['stars']].describe().show()
not_fast_rev[['stars']].describe().show()
In the Yelp dataset, there are 220,507 reviews of fast food restaurants and 3,422,943 reviews of non-fast food restaurants. The average rating of fast food establishments (3.175 stars) is about 0.57 stars lower than the average rating of other restaurants (3.742 stars).
# Side-by-side box plots of star ratings: fast food vs. all other restaurants.
fast_food_stars = fast_food_rev.select('stars').toPandas()['stars']
other_stars = not_fast_rev.select('stars').toPandas()['stars']

data = [
    go.Box(y=fast_food_stars, name='Fast Food'),
    go.Box(y=other_stars, name='Other'),
]
layout = go.Layout(title='Fast Food vs. Other Ratings Boxplots')
go.Figure(data=data, layout=layout).show()